import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from scipy.stats import zscore
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
file_path = '/content/drive/MyDrive/EV_Population_WA_Data.csv'
df = pd.read_csv(file_path)
df.head()
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
VIN (1-10) | County | City | State | Postal Code | Model Year | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | Vehicle Location | Electric Utility | 2020 Census Tract | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5YJ3E1EB0J | Thurston | Olympia | WA | 98512.0 | 2018 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 215 | 0 | 35.0 | 104823078 | POINT (-122.957046 46.991391) | PUGET SOUND ENERGY INC | 5.306701e+10 |
1 | WA1AAAGE9M | Kitsap | Port Orchard | WA | 98367.0 | 2021 | AUDI | E-TRON | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 222 | 0 | 35.0 | 156660507 | POINT (-122.6530052 47.4739066) | PUGET SOUND ENERGY INC | 5.303509e+10 |
2 | 5YJ3E1EA2J | Yakima | Yakima | WA | 98902.0 | 2018 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 215 | 0 | 14.0 | 269374108 | POINT (-120.530331 46.59534) | PACIFICORP | 5.307700e+10 |
3 | 5YJ3E1EA4N | Yakima | Yakima | WA | 98902.0 | 2022 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 15.0 | 213383894 | POINT (-120.530331 46.59534) | PACIFICORP | 5.307700e+10 |
4 | 7SAYGAEE2P | Snohomish | Bothell | WA | 98012.0 | 2023 | TESLA | MODEL Y | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 1.0 | 229496046 | POINT (-122.206146 47.839957) | PUGET SOUND ENERGY INC | 5.306105e+10 |
df.tail()
VIN (1-10) | County | City | State | Postal Code | Model Year | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | Vehicle Location | Electric Utility | 2020 Census Tract | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
200043 | JTMAB3FVXR | Snohomish | Snohomish | WA | 98290.0 | 2024 | TOYOTA | RAV4 PRIME | Plug-in Hybrid Electric Vehicle (PHEV) | Clean Alternative Fuel Vehicle Eligible | 42 | 0 | 44.0 | 262809249 | POINT (-122.0483457 47.9435765) | PUGET SOUND ENERGY INC | 5.306105e+10 |
200044 | 7FCTGAAA7P | Pierce | Orting | WA | 98360.0 | 2023 | RIVIAN | R1T | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 2.0 | 252195450 | POINT (-122.197791 47.0948565) | PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) | 5.305307e+10 |
200045 | 1V2GNPE87P | Spokane | Spokane | WA | 99201.0 | 2023 | VOLKSWAGEN | ID.4 | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 3.0 | 227314790 | POINT (-117.428902 47.658268) | MODERN ELECTRIC WATER COMPANY | 5.306300e+10 |
200046 | 1G1RD6E42E | Snohomish | Mountlake Terrace | WA | 98043.0 | 2014 | CHEVROLET | VOLT | Plug-in Hybrid Electric Vehicle (PHEV) | Clean Alternative Fuel Vehicle Eligible | 38 | 0 | 32.0 | 170747377 | POINT (-122.306706 47.792043) | PUGET SOUND ENERGY INC | 5.306105e+10 |
200047 | 5YJ3E1EAXP | Kitsap | Bremerton | WA | 98311.0 | 2023 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 23.0 | 224663134 | POINT (-122.636245 47.62806) | PUGET SOUND ENERGY INC | 5.303509e+10 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200048 entries, 0 to 200047 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 VIN (1-10) 200048 non-null object 1 County 200044 non-null object 2 City 200044 non-null object 3 State 200048 non-null object 4 Postal Code 200044 non-null float64 5 Model Year 200048 non-null int64 6 Make 200048 non-null object 7 Model 200048 non-null object 8 Electric Vehicle Type 200048 non-null object 9 Clean Alternative Fuel Vehicle (CAFV) Eligibility 200048 non-null object 10 Electric Range 200048 non-null int64 11 Base MSRP 200048 non-null int64 12 Legislative District 199606 non-null float64 13 DOL Vehicle ID 200048 non-null int64 14 Vehicle Location 200040 non-null object 15 Electric Utility 200044 non-null object 16 2020 Census Tract 200044 non-null float64 dtypes: float64(3), int64(4), object(10) memory usage: 25.9+ MB
df.describe()
Postal Code | Model Year | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | 2020 Census Tract | |
---|---|---|---|---|---|---|---|
count | 200044.000000 | 200048.000000 | 200048.000000 | 200048.000000 | 199606.000000 | 2.000480e+05 | 2.000440e+05 |
mean | 98176.178121 | 2020.871261 | 53.485279 | 947.551913 | 28.986063 | 2.262988e+08 | 5.297545e+10 |
std | 2424.207811 | 2.994933 | 88.786301 | 7860.591091 | 14.908108 | 7.282433e+07 | 1.605628e+09 |
min | 1731.000000 | 1997.000000 | 0.000000 | 0.000000 | 1.000000 | 4.385000e+03 | 1.001020e+09 |
25% | 98052.000000 | 2019.000000 | 0.000000 | 0.000000 | 17.000000 | 1.904573e+08 | 5.303301e+10 |
50% | 98125.000000 | 2022.000000 | 0.000000 | 0.000000 | 33.000000 | 2.363396e+08 | 5.303303e+10 |
75% | 98372.000000 | 2023.000000 | 53.000000 | 0.000000 | 42.000000 | 2.609659e+08 | 5.305307e+10 |
max | 99577.000000 | 2025.000000 | 337.000000 | 845000.000000 | 49.000000 | 4.792548e+08 | 5.602100e+10 |
df.shape
(200048, 17)
df.isnull().sum()
0 | |
---|---|
VIN (1-10) | 0 |
County | 4 |
City | 4 |
State | 0 |
Postal Code | 4 |
Model Year | 0 |
Make | 0 |
Model | 0 |
Electric Vehicle Type | 0 |
Clean Alternative Fuel Vehicle (CAFV) Eligibility | 0 |
Electric Range | 0 |
Base MSRP | 0 |
Legislative District | 442 |
DOL Vehicle ID | 0 |
Vehicle Location | 8 |
Electric Utility | 4 |
2020 Census Tract | 4 |
# # Drop rows with missing categorical values
# df = df.dropna(subset=['County', 'City', 'Model', 'Electric Utility'])
# # Check if categorical missing values are removed
# df.isnull().sum()
# Fill missing numerical values with the median
# df.loc[:, 'Electric Range'] = df['Electric Range'].fillna(df['Electric Range'].median())
df.loc[:, 'Base MSRP'] = df['Base MSRP'].fillna(df['Base MSRP'].mean())
df.loc[:, 'Legislative District'] = df['Legislative District'].fillna(df['Legislative District'].median())
# Verify that all missing values are handled
df.isnull().sum()
0 | |
---|---|
VIN (1-10) | 0 |
County | 4 |
City | 4 |
State | 0 |
Postal Code | 4 |
Model Year | 0 |
Make | 0 |
Model | 0 |
Electric Vehicle Type | 0 |
Clean Alternative Fuel Vehicle (CAFV) Eligibility | 0 |
Electric Range | 0 |
Base MSRP | 0 |
Legislative District | 0 |
DOL Vehicle ID | 0 |
Vehicle Location | 8 |
Electric Utility | 4 |
2020 Census Tract | 4 |
df['County'].fillna(df['County'].mode()[0], inplace=True)
df['City'].fillna(df['City'].mode()[0], inplace=True)
df['Postal Code'].fillna(df['Postal Code'].mode()[0], inplace=True)
df['Electric Utility'].fillna(df['Electric Utility'].mode()[0], inplace=True)
df.dropna(subset=['2020 Census Tract'], inplace=True)
df.dropna(subset=['Vehicle Location'], inplace=True)
df.dropna(subset=['Legislative District'], inplace=True)
print("Missing values after handling:")
print(df.isnull().sum())
Missing values after handling: VIN (1-10) 0 County 0 City 0 State 0 Postal Code 0 Model Year 0 Make 0 Model 0 Electric Vehicle Type 0 Clean Alternative Fuel Vehicle (CAFV) Eligibility 0 Electric Range 0 Base MSRP 0 Legislative District 0 DOL Vehicle ID 0 Vehicle Location 0 Electric Utility 0 2020 Census Tract 0 dtype: int64
# Summary statistics for key numerical variables
summary_stats = df[['Electric Range', 'Base MSRP', 'Model Year']].describe().T
summary_stats['variance'] = df[['Electric Range', 'Base MSRP', 'Model Year']].var()
# Display the statistics
summary_stats[['mean', '50%', 'std', 'variance']] # 50% represents the median
mean | 50% | std | variance | |
---|---|---|---|---|
Electric Range | 53.484268 | 0.0 | 88.786478 | 7.883039e+03 |
Base MSRP | 947.161643 | 0.0 | 7859.560065 | 6.177268e+07 |
Model Year | 2020.871336 | 2022.0 | 2.994938 | 8.969653e+00 |
plt.figure(figsize=(15, 6))
# Histogram for Electric Range
plt.subplot(1, 2, 1)
sns.histplot(df['Electric Range'], bins=30, kde=True)
plt.title('Distribution of Electric Range')
plt.xlabel('Electric Range (miles)')
plt.ylabel('Frequency')
# Box Plot for Electric Range
plt.subplot(1, 2, 2)
sns.boxplot(df['Electric Range'])
plt.title('Box Plot of Electric Range')
plt.xlabel('Electric Range (miles)')
plt.tight_layout()
plt.show()
plt.figure(figsize=(15, 6))
# Histogram for Base MSRP
plt.subplot(1, 2, 1)
sns.histplot(df['Base MSRP'], bins=30, kde=True)
plt.title('Distribution of Base MSRP')
plt.xlabel('Base MSRP (in dollars)')
plt.ylabel('Frequency')
# Box Plot for Base MSRP
plt.subplot(1, 2, 2)
sns.boxplot(df['Base MSRP'])
plt.title('Box Plot of Base MSRP')
plt.xlabel('Base MSRP (in dollars)')
plt.tight_layout()
plt.show()
plt.figure(figsize=(15, 6))
# Histogram for Model Year
plt.subplot(1, 2, 1)
sns.histplot(df['Model Year'], bins=30, kde=True)
plt.title('Distribution of Model Year')
plt.xlabel('Model Year')
plt.ylabel('Frequency')
# Box Plot for Model Year
plt.subplot(1, 2, 2)
sns.boxplot(df['Model Year'])
plt.title('Box Plot of Model Year')
plt.xlabel('Model Year')
plt.tight_layout()
plt.show()
plt.figure(figsize=(10, 6))
sns.countplot(y='Make', data=df, order=df['Make'].value_counts().index[:10])
plt.title('Top 10 Car Makers')
plt.xlabel('Count')
plt.ylabel('Car Maker')
plt.show()
unique_models = df[['Make', 'Model']].drop_duplicates()
model_counts = unique_models['Make'].value_counts().reset_index()
model_counts.columns = ['Make', 'Unique Models']
# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(x='Unique Models', y='Make', data=model_counts, palette='viridis')
plt.title('Unique Models per Maker')
plt.xlabel('Number of Unique Models')
plt.ylabel('Maker')
plt.show()
print(model_counts)
Make Unique Models 0 BMW 15 1 AUDI 12 2 MERCEDES-BENZ 11 3 HYUNDAI 9 4 KIA 8 5 FORD 8 6 CHEVROLET 8 7 VOLVO 7 8 TESLA 6 9 TOYOTA 5 10 PORSCHE 4 11 HONDA 4 12 LEXUS 4 13 SMART 3 14 MAZDA 3 15 CADILLAC 3 16 GENESIS 3 17 RIVIAN 3 18 LAND ROVER 2 19 FISKER 2 20 BENTLEY 2 21 MINI 2 22 LINCOLN 2 23 SUBARU 2 24 MITSUBISHI 2 25 FIAT 2 26 NISSAN 2 27 VOLKSWAGEN 2 28 JEEP 2 29 CHRYSLER 1 30 JAGUAR 1 31 LUCID 1 32 ALFA ROMEO 1 33 DODGE 1 34 GMC 1 35 ACURA 1 36 POLESTAR 1 37 AZURE DYNAMICS 1 38 TH!NK 1 39 WHEEGO ELECTRIC CARS 1 40 ROLLS-ROYCE 1 41 RAM 1
unique_models = df[['Make', 'Model']].drop_duplicates()
# Group by Maker and aggregate model names using 'list'
model_counts = unique_models.groupby('Make')['Model'].agg(list).reset_index()
model_counts['Count'] = model_counts['Model'].str.len()
# Get top 5 makers with the most unique models
top_makers = model_counts.sort_values(by='Count', ascending=False).head(5)
# Display the result
print(top_makers)
print(model_counts.sort_values(by='Count', ascending=False))
Make Model Count 5 BMW [I3, X3, X5, I4, 330E, IX, 530E, I5, I7, I8, 7... 15 2 AUDI [E-TRON, A3, Q5, Q8, Q5 E, Q4, E-TRON GT, E-TR... 12 25 MERCEDES-BENZ [EQE-CLASS SUV, EQS-CLASS SUV, EQS-CLASS SEDAN... 11 16 HYUNDAI [IONIQ, IONIQ 5, KONA ELECTRIC, SANTA FE, IONI... 9 12 FORD [ESCAPE, C-MAX, FOCUS, FUSION, MUSTANG MACH-E,... 8 Make Model \ 5 BMW [I3, X3, X5, I4, 330E, IX, 530E, I5, I7, I8, 7... 2 AUDI [E-TRON, A3, Q5, Q8, Q5 E, Q4, E-TRON GT, E-TR... 25 MERCEDES-BENZ [EQE-CLASS SUV, EQS-CLASS SUV, EQS-CLASS SEDAN... 16 HYUNDAI [IONIQ, IONIQ 5, KONA ELECTRIC, SANTA FE, IONI... 12 FORD [ESCAPE, C-MAX, FOCUS, FUSION, MUSTANG MACH-E,... 19 KIA [SOUL, SPORTAGE, NIRO, SORENTO, EV6, EV9, OPTI... 7 CHEVROLET [VOLT, BOLT EV, BOLT EUV, SPARK, BLAZER EV, SI... 40 VOLVO [XC60, XC90, S60, XC40, C40, V60, S90] 36 TESLA [MODEL 3, MODEL Y, MODEL S, MODEL X, CYBERTRUC... 38 TOYOTA [PRIUS PLUG-IN, PRIUS PRIME, RAV4 PRIME, BZ4X,... 21 LEXUS [RX, NX, RZ, TX] 15 HONDA [CLARITY, PROLOGUE, ACCORD, CR-V] 30 PORSCHE [TAYCAN, PANAMERA, CAYENNE, 918] 34 SMART [FORTWO ELECTRIC DRIVE, EQ FORTWO, FORTWO] 32 RIVIAN [R1T, R1S, EDV] 6 CADILLAC [LYRIQ, ELR, CT6] 24 MAZDA [CX-90, CX-70, MX-30] 13 GENESIS [GV60, GV70, G80] 28 NISSAN [LEAF, ARIYA] 27 MITSUBISHI [OUTLANDER, I-MIEV] 26 MINI [HARDTOP, COUNTRYMAN] 4 BENTLEY [FLYING SPUR, BENTAYGA] 11 FISKER [OCEAN, KARMA] 22 LINCOLN [CORSAIR, AVIATOR] 20 LAND ROVER [RANGE ROVER SPORT, RANGE ROVER] 18 JEEP [GRAND CHEROKEE, WRANGLER] 39 VOLKSWAGEN [E-GOLF, ID.4] 10 FIAT [500, 500E] 35 SUBARU [CROSSTREK, SOLTERRA] 37 TH!NK [CITY] 33 ROLLS-ROYCE [SPECTRE] 0 ACURA [ZDX] 31 RAM [PROMASTER 3500] 29 POLESTAR [PS2] 23 LUCID [AIR] 1 ALFA ROMEO [TONALE] 17 JAGUAR [I-PACE] 14 GMC [HUMMER EV PICKUP] 9 DODGE [HORNET] 8 CHRYSLER [PACIFICA] 3 AZURE DYNAMICS [TRANSIT CONNECT ELECTRIC] 41 WHEEGO ELECTRIC CARS [WHEEGO] Count 5 15 2 12 25 11 16 9 12 8 19 8 7 8 40 7 36 6 38 5 21 4 15 4 30 4 34 3 32 3 6 3 24 3 13 3 28 2 27 2 26 2 4 2 11 2 22 2 20 2 18 2 39 2 10 2 35 2 37 1 33 1 0 1 31 1 29 1 23 1 1 1 17 1 14 1 9 1 8 1 3 1 41 1
plt.figure(figsize=(10,6))
sns.boxplot(x=df['Postal Code'])
plt.show()
z = np.abs(zscore(df['Postal Code']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())
Number of outliers before handling: 322
upper_limit = df['Postal Code'].quantile(0.95)
lower_limit = df['Postal Code'].quantile(0.05)
df['Postal Code'] = np.clip(df['Postal Code'], lower_limit, upper_limit)
z = np.abs(zscore(df['Postal Code']))
print("Number of outliers after handling:", (z > 3).sum())
Number of outliers after handling: 0
plt.figure(figsize=(10,6))
sns.boxplot(x=df['Model Year'])
plt.show()
z = np.abs(zscore(df['Model Year']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())
Number of outliers before handling: 781
upper_limit = df['Model Year'].quantile(0.95)
lower_limit = df['Model Year'].quantile(0.05)
df['Model Year'] = np.clip(df['Model Year'], lower_limit, upper_limit)
z = np.abs(zscore(df['Model Year']))
print("Number of outliers after handling:", (z > 3).sum())
Number of outliers after handling: 0
# plt.figure(figsize=(10,6))
# sns.boxplot(x=df['Base MSRP'])
# plt.show()
# z = np.abs(zscore(df['Base MSRP']))
# df_outliers = df[z > 3]
# print("Number of outliers before handling:", (z > 3).sum())
# upper_limit = df['Base MSRP'].quantile(0.95)
# lower_limit = df['Base MSRP'].quantile(0.05)
# df['Base MSRP'] = np.clip(df['Base MSRP'], lower_limit, upper_limit)
# z = np.abs(zscore(df['Base MSRP']))
# print("Number of outliers after handling:", (z > 3).sum())
# plt.figure(figsize=(10,6))
# sns.boxplot(x=df['Legislative District'])
# plt.show()
# z = np.abs(zscore(df['Legislative District']))
# df_outliers = df[z > 3]
# print("Number of outliers before handling:", (z > 3).sum())
plt.figure(figsize=(10,6))
sns.boxplot(x=df['DOL Vehicle ID'])
plt.show()
z = np.abs(zscore(df['DOL Vehicle ID']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())
Number of outliers before handling: 9120
upper_limit = df['DOL Vehicle ID'].quantile(0.95)
lower_limit = df['DOL Vehicle ID'].quantile(0.05)
df['DOL Vehicle ID'] = np.clip(df['DOL Vehicle ID'], lower_limit, upper_limit)
z = np.abs(zscore(df['DOL Vehicle ID']))
print("Number of outliers after handling:", (z > 3).sum())
plt.figure(figsize=(10,6))
sns.boxplot(x=df['2020 Census Tract'])
plt.show()
z = np.abs(zscore(df['2020 Census Tract']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())
Number of outliers after handling: 0
Number of outliers before handling: 360
print("Data types before correction:")
print(df.dtypes)
Data types before correction: VIN (1-10) object County object City object State object Postal Code float64 Model Year int64 Make object Model object Electric Vehicle Type object Clean Alternative Fuel Vehicle (CAFV) Eligibility object Electric Range int64 Base MSRP int64 Legislative District float64 DOL Vehicle ID float64 Vehicle Location object Electric Utility object 2020 Census Tract float64 dtype: object
df['Postal Code'] = df['Postal Code'].astype('category')
df['Legislative District'] = df['Legislative District'].astype('category')
df['2020 Census Tract'] = df['2020 Census Tract'].astype('category')
df['County'] = df['County'].astype('category')
df['City'] = df['City'].astype('category')
df['State'] = df['State'].astype('category')
df['Make'] = df['Make'].astype('category')
df['Model'] = df['Model'].astype('category')
df['Electric Vehicle Type'] = df['Electric Vehicle Type'].astype('category')
print("Data types after correction:")
print(df.dtypes)
Data types after correction: VIN (1-10) object County category City category State category Postal Code category Model Year int64 Make category Model category Electric Vehicle Type category Clean Alternative Fuel Vehicle (CAFV) Eligibility object Electric Range int64 Base MSRP int64 Legislative District category DOL Vehicle ID float64 Vehicle Location object Electric Utility object 2020 Census Tract category dtype: object
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows before removal:", duplicate_rows.shape[0])
df.drop_duplicates(inplace=True)
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows after removal:", duplicate_rows.shape[0])
Number of duplicate rows before removal: 411 Number of duplicate rows after removal: 0
df.rename(columns = {
'VIN (1-10)': 'VIN',
'Postal Code': 'ZIP',
'Model Year': 'Year',
'Electric Vehicle Type': 'Type',
'Clean Alternative Fuel Vehicle (CAFV) Eligibility': 'CAFV',
'Base MSRP': 'MSRP',
'Electric Range': 'Range',
'DOL Vehicle ID': 'DOL'
}, inplace=True)
df.head(5)
VIN | County | City | State | ZIP | Year | Make | Model | Type | CAFV | Range | MSRP | Legislative District | DOL | Vehicle Location | Electric Utility | 2020 Census Tract | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5YJ3E1EB0J | Thurston | Olympia | WA | 98512.0 | 2018 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 215 | 0 | 35.0 | 1.127669e+08 | POINT (-122.957046 46.991391) | PUGET SOUND ENERGY INC | 5.306701e+10 |
1 | WA1AAAGE9M | Kitsap | Port Orchard | WA | 98367.0 | 2021 | AUDI | E-TRON | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 222 | 0 | 35.0 | 1.566605e+08 | POINT (-122.6530052 47.4739066) | PUGET SOUND ENERGY INC | 5.303509e+10 |
2 | 5YJ3E1EA2J | Yakima | Yakima | WA | 98902.0 | 2018 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 215 | 0 | 14.0 | 2.693741e+08 | POINT (-120.530331 46.59534) | PACIFICORP | 5.307700e+10 |
3 | 5YJ3E1EA4N | Yakima | Yakima | WA | 98902.0 | 2022 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 15.0 | 2.133839e+08 | POINT (-120.530331 46.59534) | PACIFICORP | 5.307700e+10 |
4 | 7SAYGAEE2P | Snohomish | Bothell | WA | 98012.0 | 2023 | TESLA | MODEL Y | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 1.0 | 2.294960e+08 | POINT (-122.206146 47.839957) | PUGET SOUND ENERGY INC | 5.306105e+10 |
df.loc[df['Model'] == 'S-CLASS', 'Range'] = df.loc[df['Model'] == 'S-CLASS', 'Range'].fillna(46.0)
df.loc[df['Model'] == 'S-CLASS', 'MSRP'] = df.loc[df['Model'] == 'S-CLASS', 'MSRP'].fillna(126855.0)
df.groupby('County')['Model'].nunique().idxmax()
county_with_most_unique_models = df.groupby('County')['Model'].nunique().idxmax()
# Get the most frequent model in the county with the most unique models
most_frequent_model = df[df['County'] == county_with_most_unique_models]['Model'].mode()[0]
print(most_frequent_model)
adoption_counts = df.groupby(['Year', 'Make'])['VIN'].nunique().reset_index()
pivot_data = adoption_counts.pivot(index='Year', columns='Make', values='VIN').fillna(0)
pivot_data.head(5)
plt.figure(figsize=(8, 6))
sns.lineplot(data=pivot_data, marker='o')
plt.title('EV Adoption Rate Among Car Makers over the years')
plt.xlabel('Year')
plt.ylabel('Number of Unique VINs')
plt.legend(title='Car Maker', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(pivot_data.index, rotation=45)
plt.show()
MODEL Y
import plotly.express as px
fig = px.line(adoption_counts, x='Year', y='VIN', color='Make', title = 'EV Adoption Rate Among Car Makers over the years', labels= {'VIN': 'Number of Unique VINs'}, markers=True)
# fig.update_layout(xaxis_title='Year', yaxis_title='Number of Unique VINs')
fig.show()
city_counts = df.groupby('City')['VIN'].nunique().reset_index().sort_values(by='VIN', ascending=False).head(5)
plt.figure(figsize=(8, 6))
plt.bar(city_counts['City'], city_counts['VIN'], color='skyblue')
plt.xlabel('City')
plt.ylabel('Number of Unique VINs')
plt.title('Number of Unique VINs per City')
plt.xticks(rotation=45)
plt.show()
top_makers = df.groupby('Make')['VIN'].nunique().sort_values(ascending=False).head(5)
plt.figure(figsize=(10, 6))
plt.bar(top_makers.index, top_makers.values, color='skyblue')
plt.xlabel('Car Maker')
plt.ylabel('Number of Unique VINs')
plt.title('Top 5 Car Makers Adopting EVs in Washington')
plt.xticks(rotation=45)
plt.show()
crosstab_make_city = pd.crosstab(df['Make'], df['City'], margins=True)
top_crosstab_make_city = crosstab_make_city.sum(axis=1).nlargest(10).index
top_make_city_data = crosstab_make_city.loc[top_crosstab_make_city]
top_make_city_data
City | Aberdeen | Aberdeen Proving Ground | Acme | Addy | Adna | Airway Heights | Alameda | Albion | Alderdale | Alderwood Manor | ... | Woodland | Woodway | Worley | Yacolt | Yakima | Yarrow Point | Yelm | Yorktown | Zillah | All |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Make | |||||||||||||||||||||
All | 184 | 1 | 10 | 2 | 1 | 35 | 2 | 1 | 1 | 1 | ... | 212 | 95 | 1 | 65 | 788 | 159 | 329 | 1 | 39 | 199629 |
TESLA | 47 | 1 | 2 | 2 | 0 | 11 | 0 | 0 | 0 | 0 | ... | 96 | 42 | 0 | 28 | 328 | 94 | 132 | 1 | 17 | 87781 |
CHEVROLET | 25 | 0 | 1 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | ... | 18 | 0 | 0 | 4 | 79 | 4 | 30 | 0 | 1 | 14773 |
NISSAN | 13 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 1 | ... | 12 | 5 | 0 | 6 | 30 | 1 | 21 | 0 | 2 | 14387 |
FORD | 24 | 0 | 2 | 0 | 0 | 5 | 1 | 0 | 0 | 0 | ... | 11 | 2 | 0 | 5 | 36 | 3 | 34 | 0 | 2 | 10543 |
KIA | 18 | 0 | 2 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | ... | 6 | 5 | 0 | 2 | 42 | 3 | 21 | 0 | 5 | 8760 |
BMW | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 7 | 7 | 0 | 4 | 26 | 20 | 6 | 0 | 2 | 8294 |
TOYOTA | 11 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 18 | 1 | 0 | 3 | 64 | 2 | 15 | 0 | 0 | 7668 |
VOLKSWAGEN | 3 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 9 | 1 | 0 | 1 | 17 | 1 | 6 | 0 | 2 | 5559 |
HYUNDAI | 7 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 5 | 2 | 0 | 1 | 12 | 2 | 9 | 0 | 0 | 5475 |
10 rows × 764 columns
crosstab_make_model = pd.crosstab(df['Make'], df['Model'])
crosstab_make_model
Model | 330E | 500 | 500E | 530E | 740E | 745E | 745LE | 750E | 918 | A3 | ... | VOLT | WHEEGO | WRANGLER | X3 | X5 | XC40 | XC60 | XC90 | XM | ZDX |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Make | |||||||||||||||||||||
ACURA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 |
ALFA ROMEO | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
AUDI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 562 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
AZURE DYNAMICS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
BENTLEY | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
BMW | 491 | 0 | 0 | 421 | 29 | 11 | 2 | 2 | 0 | 0 | ... | 0 | 0 | 0 | 317 | 2541 | 0 | 0 | 0 | 13 | 0 |
CADILLAC | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CHEVROLET | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 4828 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CHRYSLER | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
DODGE | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
FIAT | 0 | 786 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
FISKER | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
FORD | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
GENESIS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
GMC | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
HONDA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
HYUNDAI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
JAGUAR | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
JEEP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 3946 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
KIA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
LAND ROVER | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
LEXUS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
LINCOLN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
LUCID | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
MAZDA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
MERCEDES-BENZ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
MINI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
MITSUBISHI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
NISSAN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
POLESTAR | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
PORSCHE | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
RAM | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
RIVIAN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ROLLS-ROYCE | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SMART | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
SUBARU | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
TESLA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
TH!NK | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
TOYOTA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
VOLKSWAGEN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
VOLVO | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1080 | 1269 | 1644 | 0 | 0 |
WHEEGO ELECTRIC CARS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
42 rows × 151 columns
top_crosstab_make_model = crosstab_make_model.sum(axis=1).nlargest(10).index
top_make_model_data = crosstab_make_model.loc[top_crosstab_make_model]
top_make_model_data
Model | 330E | 500 | 500E | 530E | 740E | 745E | 745LE | 750E | 918 | A3 | ... | VOLT | WHEEGO | WRANGLER | X3 | X5 | XC40 | XC60 | XC90 | XM | ZDX |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Make | |||||||||||||||||||||
TESLA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CHEVROLET | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 4828 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
NISSAN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
FORD | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
KIA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
BMW | 491 | 0 | 0 | 421 | 29 | 11 | 2 | 2 | 0 | 0 | ... | 0 | 0 | 0 | 317 | 2541 | 0 | 0 | 0 | 13 | 0 |
TOYOTA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
VOLKSWAGEN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
HYUNDAI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
JEEP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 3946 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 rows × 151 columns
top_makes = crosstab_make_city.sum(axis=1).nlargest(10).index
top_makes_data = crosstab_make_city.loc[top_makes]
plot_data = []
for make in top_makes:
top_cities = top_makes_data.loc[make].nlargest(5)
for city, count in top_cities.items():
plot_data.append({'Make': make, 'City': city, 'Count': count})
plot_df = pd.DataFrame(plot_data)
fig = px.bar(plot_df, x='Make', y='Count', color='City', title = 'Top 10 Vehicle Makes and Their Top 5 Cities', labels = {'Count': 'Number of Vehicles'}, barmode='group')
fig.show()
fig, ax = plt.subplots(figsize=(12, 6))
colors = plt.cm.tab10(np.arange(len(plot_df['City'].unique())))
for i, city in enumerate(plot_df['City'].unique()):
city_data = plot_df[plot_df['City'] == city]
ax.bar(city_data['Make'], city_data['Count'], label=city, color=colors[i])
ax.set_xlabel('Make')
ax.set_ylabel('Count')
ax.set_title('Top 10 Vehicle Makes and Their Top 5 Cities')
ax.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.show()
df['MSRP'].isnull().sum()
0
df['MSRP'].describe()
MSRP | |
---|---|
count | 199629.000000 |
mean | 949.111677 |
std | 7867.529029 |
min | 0.000000 |
25% | 0.000000 |
50% | 0.000000 |
75% | 0.000000 |
max | 845000.000000 |
mean_msrp = df['MSRP'].replace(0.0, pd.NA).mean()
df['MSRP'] = df['MSRP'].replace(0.0, mean_msrp)
df['MSRP'].describe()
MSRP | |
---|---|
count | 199629.000000 |
mean | 57086.536607 |
std | 2935.483026 |
min | 31950.000000 |
25% | 57086.536607 |
50% | 57086.536607 |
75% | 57086.536607 |
max | 845000.000000 |
numerical_df = df.select_dtypes(include=['int64', 'float64'])
numerical_correlation_matrix = numerical_df.corr()
plt.figure(figsize=(8, 4))
sns.heatmap(numerical_correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
plt.title('Numerical Data Correlation Heatmap')
plt.show()
pearson_correlation_matrix = numerical_df.corr(method='pearson')
pearson_correlation_matrix
plt.figure(figsize=(8, 4))
sns.heatmap(pearson_correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Pearson Correlation Matrix')
plt.show()
spearman_correlation_matrix = numerical_df.corr(method='spearman')
spearman_correlation_matrix
plt.figure(figsize=(8, 4))
sns.heatmap(spearman_correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Spearman Correlation Matrix')
plt.show()
crosstab_full = pd.crosstab(df['County'], df['Make'])
# Filter top 3 counties and top 3 makes
top_counties = crosstab_full.sum(axis=1).nlargest(10).index
top_makes = crosstab_full.sum(axis=0).nlargest(10).index
crosstab_filtered = crosstab_full.loc[top_counties, top_makes]
print(crosstab_filtered)
Make TESLA CHEVROLET NISSAN FORD KIA BMW TOYOTA VOLKSWAGEN \ County King 48471 6012 6806 3987 3973 4932 2926 2731 Snohomish 12248 1423 1727 1259 1002 818 709 576 Pierce 6565 1252 1116 1137 826 617 592 391 Clark 4863 929 858 682 527 450 871 319 Thurston 2350 1025 593 723 467 168 375 329 Kitsap 2149 641 660 419 405 280 279 216 Spokane 1942 458 309 395 277 223 232 145 Whatcom 1455 499 657 280 288 120 278 234 Benton 1016 258 134 205 111 112 165 45 Skagit 767 264 169 168 120 55 116 82 Make HYUNDAI JEEP County King 2777 2449 Snohomish 660 498 Pierce 389 380 Clark 366 399 Thurston 179 164 Kitsap 245 200 Spokane 131 308 Whatcom 149 118 Benton 66 60 Skagit 96 67
top_10_cars = df[['Make', 'Model', 'Range']].nlargest(10, 'Range')
df['Make'] = df['Make'].astype(str)
df['Model'] = df['Model'].astype(str)
unique_models = df.groupby(['Make', 'Model'])['Range'].max().reset_index()
# Get the top 10 unique models with the highest range
top_10_models = unique_models.nlargest(10, 'Range')
# Create a combined column for better labeling in the plot
top_10_models['Car'] = top_10_models['Make'] + ' ' + top_10_models['Model']
# Plotting
plt.figure(figsize=(6, 4))
sns.barplot(data=top_10_models, x='Range', y='Car', palette='viridis')
plt.title('Top 10 Unique Car Models with Highest Range')
plt.xlabel('Range')
plt.ylabel('Car (Make and Model)')
plt.grid(True)
plt.show()
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
sns.boxplot(data=df, x='Make', y='Range', ax=ax[0])
ax[0].set_title('Range Distribution by Make')
ax[0].set_xlabel('Make')
ax[0].set_ylabel('Range (in Miles)')
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=45, ha='right')
sns.violinplot(data=df, x='Make', y='Range', ax=ax[1], bw='silverman')
ax[1].set_title('Range Distribution by Make (Violin Plot)')
ax[1].set_xlabel('Make')
ax[1].set_ylabel('Range (in Miles)')
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()
label_encoder = LabelEncoder()
df['City_Numeric'] = label_encoder.fit_transform(df['City'])
# Calculate the correlation
correlation = df['Range'].corr(df['City_Numeric'])
# Display the correlation value
print(f"Correlation between Electric Range and City: {correlation}")
# Optional: Visualize the relationship
plt.figure(figsize=(6, 4))
sns.scatterplot(x='City_Numeric', y='Range', data=df)
plt.title('Range vs. City')
plt.xlabel('City (Encoded)')
plt.ylabel('Range')
plt.xticks(ticks=range(len(label_encoder.classes_)), labels=label_encoder.classes_, rotation=45)
plt.tight_layout()
plt.show()
Correlation between Electric Range and City: -0.00035958723385506424
avg_range = df.groupby('Make')['Range'].mean()
plt.figure(figsize=(10, 6))
avg_range.plot(kind='bar', color='skyblue')
plt.title('Average Electrical Range by Car Make')
plt.xlabel('Make')
plt.ylabel('Average Electrical Range (miles)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Type', y='Range', palette='Set2')
plt.title('Electrical Range Distribution by Electric Vehicle Type')
plt.xlabel('Electric Vehicle Type')
plt.ylabel('Electrical Range (miles)')
plt.tight_layout()
plt.show()
# Count plot
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='County', hue='Type', palette='pastel')
plt.title('Count of Electric Vehicles by Country and Type')
plt.xlabel('County')
plt.xticks(rotation=45, ha='right', fontsize='3')
plt.ylabel('Count')
plt.legend(title='Electric Vehicle Type')
plt.tight_layout()
plt.show()